############### ###############
## 01 - CBO group cleaning
## Project: CBO
## Author: Kamil Kouhen
## Purpose: Cleaning and management of CBO data at the group level
## Date of creation: 07/01/2022
############### ###############

# Notes #
# Search "Unsolved" in document for unsolved issues (@Malte)

library(here)
#Running master file and ad-hoc function rcodes
#source(here("Code", "Rcode", "Master.R"), echo = T) #Master (contains necessary packages)

  ## Informing the user which type of data was selected ##
  # NOTE(review): `datatype` is not defined in this section; presumably it is
  # created by the master script -- confirm.
  message(paste0("You have chosen to work with: ", datatype))

# 1. Starting the cleaning ------ #

  ## Only keeping variables that are useful for CBO project analysis ##
  # Columns are dropped by exact name or, inside contains(), by substring.
  cbo_intermediate <- CBO_groups_raw %>%
    select(-question,
           -question_yn,
           -meetingtime,
           -contains(c(
                        "start_",
                        "begin",
                        "scan",
                        "phone_",
                        "participantphone",
                        "paymentsheet_confirm",
                        "idnum",
                        #Reflection exercise, not useful here
                        "survey_delay",
                        "survey_withdrawal")),
           -meeting)

  ### Useless here: dropping the commune id strings when they are present.
  # any_of() ignores names that are absent from the data, so no existence test
  # is needed. The previous test for idcommune_strr used
  # exists("cbo_intermediate$idcommune_strr"), which looks for an object with
  # that literal name and is therefore always FALSE: the column was never
  # removed.
  cbo_intermediate %<>%
    select(-any_of(c("idcommune_str", "idcommune_strr")))
  
  ## Checking blinded id variables for duplicates, NAs etc. ##

  # Missing ids are tested first: several NAs in appcode would otherwise be
  # reported by the duplicate check below with a misleading message.
  if (anyNA(cbo_intermediate$appcode)) {
    stop(
      "Ad-hoc error message: There seems to be missing values in the id variable, 
       please resolve."
    )
  }

  # anyDuplicated() returns 0 when every id is unique. Unlike the ratio
  # length(unique(id)) / nrow(data) it is also well defined for an empty
  # dataset (the ratio is NaN there and makes `if` fail).
  if (anyDuplicated(cbo_intermediate$appcode) > 0) {
    stop(
      "Ad-hoc error message: There seems to be duplicates in the id variable, 
       please check for duplicates again"
    )
  }

  ## Per the "Manuel des agents", there needs to be 662 different CBOs ##
  # Deliberately not fatal: the author notes that 4 CBOs are missing because of
  # technical difficulties at data collection.
  nbCBO <- length(unique(cbo_intermediate$appcode))
  if (nbCBO != 662) {
    message(nbCBO, " CBOs are identified instead of 662")
    print(
      "Ad-hoc error message: 662 CBOs should be identified, 
      check why the number of groups (appcode) here is different."
    )
    print("4 missing CBOs due to technical difficulties at data collection")
  }
  rm(nbCBO)

    #Note# 4 missing CBOs due to technical difficulties at data collection

  ## Checking region variable: there needs to be 13 regions 
   # per the "Manuel des agents de collecte" ##
  if (anyNA(cbo_intermediate$region)) {
    stop(
      "Ad-hoc error message: There seems to be missing values in 
      the region variable, please resolve."
    )
  }

  nbregion <- length(unique(cbo_intermediate$region))
  if (nbregion != 13) {
    message(nbregion, " Regions are identified instead of 13")
    stop("Ad-hoc error message: Please check why there are not 13 regions in the data")
  }
  rm(nbregion) #Cleaning the environment
  
  
  ### Restricting the data to surveyed CBOs (complete or partial) and keeping
    # a reference copy of that restricted dataset
  # survey_complete must not contain missing values before it is used as filter
  if (anyNA(cbo_intermediate$survey_complete)) {
    stop(
      "Ad-hoc error message: There seems to be missing values in the survey_complete variable, please resolve."
    )
  }
  table(cbo_intermediate$survey_complete) #586 complete, 70 not surveyed, 2 partial

  ## Removing unsurveyed CBOs ##
  cbo_intermediate <- cbo_intermediate %>%
    filter(survey_complete != "notsurveyed")

  # Reference copy taken right after the filter; the integrity checks further
  # down compare row and column counts against it.
  cbo_intermediate_untouched <- cbo_intermediate
  
  ## sg_name and mayor_name are not anonymised:
   # every value in both columns is overwritten with NA ##
  cbo_intermediate <- cbo_intermediate %>%
    dplyr::mutate(sg_name = NA, mayor_name = NA)

    # Note # Party name can be useful, I am leaving it. But I am dropping sg_name and mayor_name (approved by Malte)

  ## Storing the id-like variables (commune, region, *link*) as character ##
  tocharacter <- colnames(
    select(cbo_intermediate, commune, region, contains("link"))
  )
  cbo_intermediate %<>%
    dplyr::mutate(across(all_of(tocharacter), as.character))
  rm(tocharacter)

  ## Character variables other than region, commune and appcode are candidates
   # for conversion to numeric ##
  charactervars <- cbo_intermediate %>%
    select_if(is.character) %>%
    select(-c(region, commune, appcode)) %>% #Not converting id vars
    colnames()

  ### Each candidate is passed through Hmisc::all.is.numeric(what = "vector"),
    # with NA declared as an accepted extra value
  cbo_intermediate <- cbo_intermediate %>%
    dplyr::mutate(across(
      all_of(charactervars),
      ~ Hmisc::all.is.numeric(.x, what = "vector", extras = NA)
    ))
  
  ## Listing the variables whose values are all missing ##
  # The test runs over every column of the dataset, whatever its type.
  all_missing_vars <- colnames(
    select_if(cbo_intermediate, function(x) all(is.na(x)))
  )
  if (length(all_missing_vars) > 0) {
    print("The following variables are all missing, please take care of them:")
    print(all_missing_vars)
  } else {
    print("None of the numeric variables is all missing")
  }
  rm(all_missing_vars) #Cleaning the environment
  
  #Importing a dta document leaves some variables as labelled numeric vectors.
  #They are turned into factors, with haven's default choice of factor levels.
  labelledvars <- colnames(
    select_if(select_if(cbo_intermediate, is.numeric), is.labelled)
  )

  cbo_intermediate %<>%
    mutate(across(
      all_of(labelledvars),
      ~ haven::as_factor(.x, levels = "default")
    )) #It seems to be working (e.g. comp3)
  rm(labelledvars)
  
  ## Identifying variables to be recoded as factor 
   # (e.g. if it contains a Yes/No pattern) ##
  # Step 1: grepl() receives the whole data frame rather than a single column.
  #   NOTE(review): this relies on grepl() coercing the data frame to one
  #   string per column, so that select_if() gets one TRUE/FALSE per column;
  #   a column is kept when that string matches any yes/no/oui/non spelling
  #   anywhere (substring match) -- confirm this is the intended test.
  # Step 2: among those, only columns whose longest value has fewer than
  #   5 characters are kept.
  # The resulting column names are stored in `prefactor`, which is removed
  # later in the script.
  prefactor <- cbo_intermediate %>%
    select_if(
      grepl("yes|no|Yes|No|YES|NO|oui|non|Oui|Non|OUI|NON", cbo_intermediate)) %>%
    select_if(function(x) all(max(nchar(x)) < 5)) %>%
    colnames() 
  #map(prefactor, unique) #The only non "yes" or "no" string is "dk" for "don't know". I leave them for now. 
  # Converting every selected column to factor
  cbo_intermediate[prefactor] <-
    lapply(cbo_intermediate[prefactor], factor) 
  
  ## Recoding factor levels: "yes" becomes "1", "no" becomes "0" ##
  # Two separate passes, so that each factor is only asked to recode a level
  # it actually has.

  # Pass 1: factors that have a "yes" level
  tochange <- colnames(select_if(
    cbo_intermediate,
    function(x) is.factor(x) && "yes" %in% levels(x)
  ))
  cbo_intermediate <- cbo_intermediate %>%
    mutate(across(all_of(tochange), ~ forcats::fct_recode(.x, "1" = "yes")))

  # Pass 2: factors that have a "no" level
  tochange <- colnames(select_if(
    cbo_intermediate,
    function(x) is.factor(x) && "no" %in% levels(x)
  ))
  cbo_intermediate <- cbo_intermediate %>%
    mutate(across(all_of(tochange), ~ forcats::fct_recode(.x, "0" = "no")))

  rm(tochange)
  
  #Identifying binary numeric vars that should be factors and converting them
  # A column is selected when it is numeric, its maximum equals the upper code,
  # its minimum is not below the lower code and it has at most two distinct
  # non-missing values. The conditions are evaluated in that order and stop at
  # the first one that fails.

  # Upper code 1, lower code 0
  binary01_should_be_factor <- cbo_intermediate %>%
    select_if(function(x) {
      is.numeric(x) &&
        max(x, na.rm = TRUE) == 1 &&
        min(x, na.rm = TRUE) >= 0 &&
        length(unique(na.omit(x))) <= 2
    }) %>%
    colnames() #34 vars in this case

  # Upper code 2, lower code 1
  binary12_should_be_factor <- cbo_intermediate %>%
    select_if(function(x) {
      is.numeric(x) &&
        max(x, na.rm = TRUE) == 2 &&
        min(x, na.rm = TRUE) >= 1 &&
        length(unique(na.omit(x))) <= 2
    }) %>%
    colnames() #2 vars in this case

  ### Converting both sets to factor
  cbo_intermediate %<>%
    mutate(across(all_of(binary01_should_be_factor), factor)) %>%
    mutate(across(all_of(binary12_should_be_factor), factor))

  rm(binary01_should_be_factor, binary12_should_be_factor) #Cleaning the environment
  
### Random check of integrity of dataset ###
# The working data must have as many rows as the reference copy and must not
# have fewer columns than it.
if (nrow(cbo_intermediate) != nrow(cbo_intermediate_untouched)) {
  stop(
    "Something went wrong: some observations were dropped since the first creation of cbo_intermediate."
  )
}
if (ncol(cbo_intermediate_untouched) > ncol(cbo_intermediate)) {
  stop(
    "Something went wrong: some variables were dropped since the first creation of cbo_intermediate."
  )
}
###                                      ####
  
  ## Using the questionnaire to find variables that should be categorical
   # (factor) ##
  # The questionnaire workbook is read with readxl's default sheet. Rows whose
  # `type` contains "select" and does not contain "phone" are kept, the survey
  # administration variables are discarded, and the `name` column is extracted.
  shouldbefactor <- here(
    "Supporting Documents",
    "IPA deliverables",
    "3_Questionnaires finaux OCB & DECIDEURS",
    "OCB_Questionnaire_endline_group_test16.xlsx"
  ) %>%
    readxl::read_excel() %>% #Importing questionnaire
    as_tibble() %>%
    filter(grepl("select", type)) %>%
    filter(!grepl("phone", type)) %>%
    filter(!(name %in% c( #Removing survey variables
      "id", "region", "commune", "enumerator", "team", "supervisor"
    ))) %>%
    pull(name) #Only using name column

  ### Keeping only the names that are columns of the dataset and are not
    # factors already
  shouldbefactor <- shouldbefactor[
    shouldbefactor %in%
      colnames(select_if(cbo_intermediate, function(x) !is.factor(x)))
  ] #8 new factor variables

  #Converting these variables to factor
  cbo_intermediate %<>%
    mutate(across(all_of(shouldbefactor), factor))

  rm(shouldbefactor, prefactor) #Cleaning the environment
  
  ## Using questionnaire to label variables ##
  # Reads the "survey" sheet of the questionnaire workbook, keeps the variable
  # name and its English label (renamed to `label`), and restricts the rows to
  # variables that are columns of cbo_intermediate.
  varlabels <-
    as_tibble(readxl::read_excel(
      here(
        "Supporting Documents",
        "IPA deliverables",
        "3_Questionnaires finaux OCB & DECIDEURS",
        "OCB_Questionnaire_endline_group_test16.xlsx"
      ),
      sheet = "survey"
    )) %>% #Importing questionnaire
    select(name, "label::English") %>%
    rename(label = "label::English") %>%
    filter(name %in% colnames(cbo_intermediate))
  
  ## Reshaping: the goal is to have a table with a column 
   # for each variable and the first row being the label. ##
  # The two-column table is transposed; the first row of the transposed table
  # (holding the variable names) is promoted to column names and the leftover
  # `name` column (the former row names) is dropped.
  varlabels <-
    as_tibble(cbind(nms = names(varlabels), t(varlabels))) %>% 
    janitor::row_to_names(row_number = 1) %>% #First row as variable name
    select(-name)
  
  # Note # Please ignore the following warning 
         # (should stop after first run, this warning is displayed once every 8 hours): 
         # The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` 
         # is omitted as of tibble 2.0.0.
  
  # Attaching the labels to the variables of the dataset
  cbo_intermediate <-
    Hmisc::upData(cbo_intermediate, labels = varlabels) #It worked (variables are labelled)
  rm(varlabels)
  
  ## Empty strings in character variables are recorded as NA ##
  # ifelse() is applied to every character column; values other than "" are
  # passed through as.character().
  cbo_intermediate <- cbo_intermediate %>%
    mutate(across(
      all_of(names(Filter(is.character, cbo_intermediate))),
      function(x) ifelse(x == "", NA, as.character(x))
    ))
  
  
  ## Looking for non standard missing values in numeric variables (negative such as -99 or -1)
  # Names of the numeric variables whose maximum is negative
  cbo_intermediate %>%
    select_if(is.numeric) %>%
    select_if(function(x) max(x, na.rm = TRUE) < 0) %>%
    colnames()

  #Safe
  # Stops when a numeric variable holds at least one negative value. Variables
  # ending in _BL, _S or _MON and the gps coordinates (recordgps) are left out.
  if (ncol(
    cbo_intermediate %>%
      select_if(is.numeric) %>%
      select(-ends_with("_BL"), -ends_with("_S"), -ends_with("_MON")) %>%
      keep(function(x) any(x < 0 & !is.na(x))) %>%
      select(-contains("recordgps")) #Removing gps coordinates
  ) > 0) {
    stop(
      "Ad-hoc: Please check if there are variables in this case with negative values (potential missing value) that were not spotted before)."
    )
  }

  # Note # No numeric variable with negative value.

  #Looking for non standard missing values in factor variables (negative such as -99 or -1)
  # Stops when a factor (same suffix exclusions as above) has one of the listed
  # codes, a single space or a dot among its levels.
  if (ncol(
    cbo_intermediate %>%
      select(-ends_with("_BL"), -ends_with("_S"), -ends_with("_MON")) %>%
      select_if(function(x) {
        is.factor(x) &&
          any(c("-1", "-2", "-3",
                "-97", "-98", "-99",
                "-997", "-998", "-999",
                " ", ".") %in% levels(x))
      })
  ) > 0) {
    stop("Ad-hoc: some factor variables contain unexpected missing values such as -99")
  }

  # Note # (author's finding) no factor with a misrecorded or blank (space) level
  
  ### Random check of integrity of dataset ###
  # Same row count as the reference copy, and no fewer columns than it.
  if (nrow(cbo_intermediate) != nrow(cbo_intermediate_untouched)) {
    stop(
      "Something went wrong: some observations were dropped since the first creation of cbo_intermediate."
    )
  }
  if (ncol(cbo_intermediate_untouched) > ncol(cbo_intermediate)) {
    stop(
      "Something went wrong: some variables were dropped since the first creation of cbo_intermediate."
    )
  }
  ###                                      ###
  
  
  #Looking for misrecorded missing values: replacing empty cells by NA
  # Number of cells equal to "" before the replacement
  sum(cbo_intermediate == "", na.rm = TRUE) #12482
  cbo_intermediate %<>%
    mutate(across(
      all_of(names(Filter(is.character, cbo_intermediate))),
      function(x) ifelse(x == "", NA, as.character(x))
    ))
  # Number of cells equal to "."
  sum(cbo_intermediate == ".", na.rm = TRUE) #0

  #Ad-hoc function to create report with share of NAs for each variable in dataframe
  share_NAs(cbo_intermediate) #File exported in here("Output", "For Cleaning")
  
  ### Random check of integrity of dataset ###
  # Same row count as the reference copy, and no fewer columns than it.
  if (nrow(cbo_intermediate) != nrow(cbo_intermediate_untouched)) {
    stop(
      "Something went wrong: some observations were dropped since the first creation of cbo_intermediate."
    )
  }
  if (ncol(cbo_intermediate_untouched) > ncol(cbo_intermediate)) {
    stop(
      "Something went wrong: some variables were dropped since the first creation of cbo_intermediate."
    )
  }
  ###                                      ####
  
  #Categorizing variables for easier analysis
  # Each object below is a character vector of column names. Ranges written as
  # first:last depend on the current column order of cbo_intermediate.

  # Survey administration variables
  surveyvars <- colnames(select(
    cbo_intermediate, region:appcode, remark, contains("link"), situation
  ))

  # Pre-survey information
  cbo_preinfo <- colnames(select(
    cbo_intermediate, pgg_endowment:participants_total, contains("survey_")
  ))

  # Thematic groups
  cbo_activism_int <- colnames(select(cbo_intermediate, meetings_general:members_eligibility))
  cbo_leadership <- colnames(select(cbo_intermediate, leadership_title:leadership_candidates))
  cbo_finance <- colnames(select(cbo_intermediate, budget_any:contributions_nonpayment))
  covid_and_secu <- colnames(select(cbo_intermediate, covid_discussed:security_adaptation))
  activism <- colnames(select(cbo_intermediate, meet_cm:meet_village))
  recip <- colnames(select(cbo_intermediate, recip_any:recip_details))
  research <- colnames(select(cbo_intermediate, research_data:research_interviews_detail))
  organize <- colnames(select(cbo_intermediate, organize_villagemeeting:organize_socialmedia))
  pressure <- colnames(select(cbo_intermediate, pressure_campaign:pressure_protest_num))
  gov_effacite <- colnames(select(cbo_intermediate, efficacite1:efficacite4))
  local_pol <- colnames(select(cbo_intermediate, sg_name:partyname10))
  knowledge_local_pol <- colnames(select(cbo_intermediate, know1:know7))
  decision_ex <- colnames(select(cbo_intermediate, groupesize:participant_amount8))

  # Groups defined by name suffix
  baselinevars <- colnames(select(cbo_intermediate, ends_with("_BL")))
  monitoringvars <- colnames(select(cbo_intermediate, ends_with("_MON")))
  supermunvars <- colnames(select(cbo_intermediate, ends_with("_S")))
  
  ### Checking variable distributions: starting with cbo_preinfo ###
  sumstats(cbo_intermediate[cbo_preinfo])

  ### Checking variable distributions: cbo_activism_int ###
  sumstats(cbo_intermediate[cbo_activism_int])

  #meetings_executive
  #summary(cbo_intermediate$meetings_executive)
  #boxplot(cbo_intermediate$meetings_executive) #Q3 at 9 but still a quarter of obs above 9 and a max at 48. Nothing too shocking (potentially possible).

  #meetings_general
  #summary(cbo_intermediate$meetings_general)
  #boxplot(cbo_intermediate$meetings_general) #Even more extreme here: Q3 at 5, max at 60
  # Correlation between the two meeting counts
  cor(cbo_intermediate[["meetings_general"]], cbo_intermediate[["meetings_executive"]]) #0.32

  #members_died
  #summary(cbo_intermediate$members_died)
  #boxplot(cbo_intermediate$members_died) #Even more extreme here: Q3 at 5, max at 60
  # Share of CBOs with more than 3 dead members, then counts above 5 and 10
  (cbo_intermediate %>% filter(members_died > 3) %>% nrow()) / nrow(cbo_intermediate)
  cbo_intermediate %>% filter(members_died > 5) %>% nrow()
  cbo_intermediate %>% filter(members_died > 10) %>% nrow()
  # Note # very weird numbers for members_died: 8% with more than 3 members who died (2020 so year of pandemic)
  # but 28 CBOs with more than 5 dead members and 5 with more than 10 (one with 60!)
  
  #Using ad-hoc "flagging" function to flag spotted values of concern, export the info to a .xlsx file called "Flagged_values" in "For Cleaning"
  #Creating vector with ids of CBOs with high number of dead members
  # highdeath <-
  #   cbo_intermediate %>% filter(members_died > 5) %>% select(
  #     appcode,
  #     members_died,
  #     remark,
  #     covid_prevalence,
  #     security_disruption,
  #     security_adaptation
  #   )
  # 
  # table(highdeath$covid_prevalence)
  # table(highdeath$security_disruption, highdeath$members_died) #Interesting: the CBOs with 17, 20 and 60 dead members have answered "considerable disrupted" for the impact of the security situation
  # table(highdeath$security_adaptation, highdeath$members_died) #It might actually be the case that more than 10 members died
  # 
  # for (i in unique(highdeath$appcode)) {
  #   ##Used for loop inside of map because map only saves the last iteration for some reason
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = members_died,
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "High number of dead members: the CBOs with 17, 20 and 60 dead members have answered considerable disrupted for the impact of the security situation. These very high numbers might actually be valid. Please check the code for more detail."
  #   )
  #   rm(idss)
  # }
  # rm(highdeath)
  # 
  #members_joined
  #summary(cbo_intermediate$members_joined)
  #boxplot(cbo_intermediate$members_joined) #Very high number in some cases (mean > Q3, median = 230)
  # Share of CBOs with more than 5 new members
  sum(cbo_intermediate[["members_joined"]] > 5) / nrow(cbo_intermediate) #But still 39% of obs with more than 5 new members
  #hist(cbo_intermediate$members_joined)
  #plot(density(cbo_intermediate$members_joined))
  
  # high_joined <-
  #   cbo_intermediate %>% filter(members_joined > 15)
  # for (i in unique(high_joined$appcode)) {
  #   #Flagging those with higher number of new number than the mean (15)
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = members_joined,
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "High number of new members (> 15), very high variance in this variable. Worth a check."
  #   )
  #   rm(idss)
  # }
  # rm(high_joined)
  
  #members_left
  #summary(cbo_intermediate$members_left)
  #boxplot(cbo_intermediate$members_left) #Similar case here: most == 0, 17% above zero, max at 180
  # Share of CBOs that lost at least one member, then the upper quantiles
  sum(cbo_intermediate[["members_left"]] > 0) / nrow(cbo_intermediate)
  quantile(cbo_intermediate[["members_left"]], probs = c(0.95, 0.97, 0.99))
  #plot(density(cbo_intermediate$members_left))
  
  # high_left <-
  #   cbo_intermediate %>% filter(members_left > 10)
  # for (i in unique(high_left$appcode)) {
  #   #Flagging those with higher number of new number than the mean (15)
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = members_left,
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "High number of members who left (> 10). Max is 180."
  #   )
  #   rm(idss)
  # }
  # rm(high_left)
  
  ### Checking variable distributions: cbo_leadership ###
  # Class of every leadership variable
  sapply(cbo_intermediate[cbo_leadership], class)

  #leadership_title (probably not important for analysis)
  table(cbo_intermediate[["leadership_title"]], useNA = "always")

  #leadership_selection (probably not important for analysis)
  table(cbo_intermediate[["leadership_selection"]], useNA = "always") #All seems to be good

  #leadership_selection_other (probably not important for analysis)

  #leadership_selection_year (probably not important for analysis)
  table(cbo_intermediate[["leadership_selection_year"]], useNA = "always") #All seems to be good
  # Shares of CBOs whose last selection was in 2018 or later, in 2019, in 2020
  sum(cbo_intermediate[["leadership_selection_year"]] >= 2018) / nrow(cbo_intermediate)
  sum(cbo_intermediate[["leadership_selection_year"]] == 2019) / nrow(cbo_intermediate)
  sum(cbo_intermediate[["leadership_selection_year"]] == 2020) / nrow(cbo_intermediate)
  
  # Note # Lots of CBOs with last election since 2018 (57%), most in 2019 (22%), 2020 (11%)
  
  #leadership_candidates
  # table(cbo_intermediate$leadership_candidates,
  #       useNA = "always") #1 very high value (2011)
  # idinquestion <-
  #   cbo_intermediate %>% filter(leadership_candidates == 2011) %>% select(appcode)
  # idinquestion <- as.character(unique(idinquestion$appcode))
  # flagging(
  #   df = cbo_intermediate,
  #   selected.variable = members_left,
  #   selected.id = idinquestion,
  #   as.NA = FALSE,
  #   #Not turning into NA for now
  #   remarks = "Vary high number of leadership candidates (2011) while the second highest is 10."
  # )
  # table(
  #   cbo_intermediate$leadership_candidates,
  #   cbo_intermediate$leadership_selection_year
  # )
  # Note # The election with 2011 candidates was held in 2020
  
  ### Checking variable distributions: cbo_finance ###
  sumstats(cbo_intermediate[cbo_finance]) #Ad-hoc function for simple #summary stats

  # Note #Be careful: some NAs in here

  #budget_any
  epiDisplay::tab1(cbo_intermediate$budget_any,
                   graph = FALSE,
                   missing = TRUE) #Simple frequency table with percentages (with simple graph)

  # Note # 15% of CBOs with no budget at all

  #budget_total: very high variance (sd = 12m)
  # Share of the missing budget_total values that belong to CBOs reporting no
  # budget. `%in% 0` is FALSE (never NA) when budget_any is missing, so the
  # count cannot turn into NA.
  sum(
    is.na(cbo_intermediate$budget_total) &
      cbo_intermediate$budget_any %in% 0
  ) / sum(is.na(cbo_intermediate$budget_total))

  # Note # The CBOs with no budget (according to budget_any) have NA recorded in budget_total, I change it to 0
  # The test uses `%in% 0` instead of `== 0`: with `==`, a missing budget_any
  # makes ifelse() return NA and would erase a budget_total that was recorded.
  cbo_intermediate <-
    cbo_intermediate %>%
    dplyr::mutate(budget_total = ifelse(budget_any %in% 0, 0, budget_total))
  reporting.changes(cbo_intermediate, "budget_total", describe = "NA to 0 for CBOs with no budget according to budget_any. No NA left in this variable.")

  #budget_total: checking high values
  #cbo_intermediate %>% ggplot(aes(x = budget_total)) + geom_density() #Crazy skewness
  #cbo_intermediate %>% filter(budget_total > median(budget_total)) %>% ggplot(aes(x = budget_total)) + geom_density() #Crazy skewness
  # na.rm = TRUE throughout, consistent with the quantile() calls below, so
  # that a remaining NA does not turn the whole count into NA.
  sum(
    cbo_intermediate$budget_total > (
      mean(cbo_intermediate$budget_total, na.rm = TRUE) +
        1 * sd(cbo_intermediate$budget_total, na.rm = TRUE)
    ),
    na.rm = TRUE
  ) #How many CBOs above mean + one sd: only 33
  quantile(cbo_intermediate$budget_total, 0.75, na.rm = TRUE)
  quantile(cbo_intermediate$budget_total, 0.9, na.rm = TRUE)
  
  #Flagging the top 10% of values (over 5m of budget) as high values
  # highbudget <-
  #   cbo_intermediate %>% filter(budget_total >= quantile(budget_total, 0.9, na.rm =
  #                                                                         TRUE))
  # 
  # for (i in unique(highbudget$appcode)) {
  #   ##Used for loop inside of map because map only saves the last iteration for some reason
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = budget_total,
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "Very high budget (over 5m - top 10%), max is 200m, median is 250k. Check if consistent."
  #   )
  #   rm(idss)
  # }
  # rm(highbudget)
  
  #contributions_any, _total, _late, _nonpayment
  epiDisplay::tab1(
    cbo_intermediate$contributions_any,
    graph = FALSE,
    missing = TRUE
  ) #Simple frequency table with percentages (with simple graph)

  # For each follow-up variable: share of its missing values that belong to
  # CBOs reporting no contributions. `%in% 0` is FALSE (never NA) when
  # contributions_any is missing, so the counts cannot turn into NA.
  sum(
    is.na(cbo_intermediate$contributions_total) &
      cbo_intermediate$contributions_any %in% 0
  ) / sum(is.na(cbo_intermediate$contributions_total))
  sum(
    is.na(cbo_intermediate$contributions_late) &
      cbo_intermediate$contributions_any %in% 0
  ) / sum(is.na(cbo_intermediate$contributions_late))
  sum(
    is.na(cbo_intermediate$contributions_nonpayment) &
      cbo_intermediate$contributions_any %in% 0
  ) / sum(is.na(cbo_intermediate$contributions_nonpayment))

  # Note # 19% of CBOs with no contributions from members in 2020. Same problem with missing values as for budget_total, changing to 0 when applicable.
  # The test uses `%in% 0` instead of `== 0`: with `==`, a missing
  # contributions_any makes ifelse() return NA and would erase values that
  # were recorded in the follow-up variables.
  cbo_intermediate %<>%
    dplyr::mutate(
      contributions_total = ifelse(contributions_any %in% 0, 0, contributions_total),
      contributions_late = ifelse(contributions_any %in% 0, 0, contributions_late),
      contributions_nonpayment = ifelse(contributions_any %in% 0, 0, contributions_nonpayment)
    )

  reporting.changes(cbo_intermediate,
                    "contributions_total",
                    describe = "NA to 0 for CBOs with no contributions according to contributions_any No NA left in this variable.")
  reporting.changes(cbo_intermediate,
                    "contributions_late",
                    describe = "NA to 0 for CBOs with no contributions according to contributions_any No NA left in this variable.")
  reporting.changes(cbo_intermediate,
                    "contributions_nonpayment",
                    describe = "NA to 0 for CBOs with no contributions according to contributions_any No NA left in this variable.")
  
  #contributions_total: very high values too here in some cases
  #cbo_intermediate %>% 
    #ggplot(aes(x = contributions_total)) + geom_density() #Crazy skewness too
  #cbo_intermediate %>% filter(budget_total > median(contributions_total)) %>% 
    #ggplot(aes(x = contributions_total)) + geom_density() #Crazy skewness
  
  # Summary statistics through the ad-hoc sumstats() helper (not defined in
  # this section), then the 99th percentile with missing values ignored.
  sumstats(cbo_intermediate$contributions_total) #Mean > Q3
  quantile(cbo_intermediate$contributions_total,
           0.99,
           na.rm = TRUE)
  
  # #Flagging the top 1% of values (over 4.5m of budget) as high values (max is 20m!)
  # highcontributions <-
  #   cbo_intermediate %>% filter(contributions_total >= quantile(contributions_total, 0.99, na.rm =
  #                                                                                TRUE))
  # 
  # for (i in unique(highcontributions$appcode)) {
  #   ##Used for loop inside of map because map only saves the last iteration for some reason
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = contributions_total,
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "Very high contributions (over 4.5m - top 1%), max is 20m!, median is 75k. Check if consistent."
  #   )
  #   rm(idss)
  # }
  # rm(highcontributions)
  
  # Summary statistics of the finance variables once more, after the NA-to-0
  # replacements made above
  sumstats(cbo_intermediate[cbo_finance]) #Ad-hoc function for simple #summary stats
  
  ### Checking variable distributions: covid_and_secu ###
  cbo_intermediate[covid_and_secu] %>% sumstats #All factors (two binary: _discussed and _policy) and one numeric (_prevalence)
  
    # Note # Only 2 CBOs with covid cases (covid_prevalence)
  
  #security_disruption
  # Frequency table without graph; missing = TRUE is passed so that missing
  # values are part of the table
  epiDisplay::tab1(
    cbo_intermediate$security_disruption,
    graph = FALSE,
    missing = TRUE
  ) #Simple frequency table
  
  #security_adaptation
  # Same table for the adaptation variable
  epiDisplay::tab1(
    cbo_intermediate$security_adaptation,
    graph = FALSE,
    missing = TRUE
  ) #Simple frequency table
  
  # Note # 33.8% of CBOs with "Considerably disrupted" activities and 4.9% "completely suspended"
  # 70% at least "Slightly disrupted"!
  
  ### Checking variable distributions: activism ###
  sumstats(cbo_intermediate[activism]) #Mostly factors but also some numeric. SOME NAs

  #Checking variables with NAs: For all these missing values, an NA means a 0 for the _any variable, they should therefore be put as zero
  #Except when the _any variable is marked as "Don't Know"
  # Each table crosses an _any variable with one of its follow-up variables,
  # missing values included.

  # Secretary general (sg)
  table(cbo_intermediate[["meet_sg_any"]], cbo_intermediate[["meet_sg_num"]], useNA = "always")
  table(cbo_intermediate[["meet_sg_any"]], cbo_intermediate[["meet_sg_requested"]], useNA = "always")

  # Administration
  table(cbo_intermediate[["meet_admin_any"]], cbo_intermediate[["meet_admin_num"]], useNA = "always")
  table(cbo_intermediate[["meet_admin_any"]], cbo_intermediate[["meet_admin_requested"]], useNA = "always")

  # Mayor
  table(cbo_intermediate[["meet_mayor_any"]], cbo_intermediate[["meet_mayor_num"]], useNA = "always")
  table(cbo_intermediate[["meet_mayor_any"]], cbo_intermediate[["meet_mayor_requested"]], useNA = "always")

  # Councilor
  table(cbo_intermediate[["meet_councilor_any"]], cbo_intermediate[["meet_councilor_num"]], useNA = "always")
  table(cbo_intermediate[["meet_councilor_any"]], cbo_intermediate[["meet_councilor_opposition"]], useNA = "always")
  table(cbo_intermediate[["meet_councilor_any"]], cbo_intermediate[["meet_councilor_committee"]], useNA = "always")
  table(cbo_intermediate[["meet_councilor_any"]], cbo_intermediate[["meet_councilor_requested"]], useNA = "always")
  
  #Reporting the changes for every activism variable that has missing values
  # reporting.changes() is an ad-hoc helper that is not defined in this
  # section; it is called once per variable, before the NA-to-0 replacement
  # that follows.
  for (i in as_vector(colnames(
    select_if(cbo_intermediate[activism], function(x) mean(is.na(x)) > 0)
  ))) {
    vari <- paste0(i)
    reporting.changes(
      cbo_intermediate,
      selected.variable = vari,
      describe = "NA to 0. For all these missing values, an NA means a 0 for the _any variable, they should therefore be put as zero. Except when the _any variable is marked as Don't Know"
    )
  }
  
  # Setting the follow-up variables to 0 for CBOs whose _any variable is 0.
  # - The tests use `%in% 0` instead of `== 0`: with `==`, a missing _any
  #   value makes ifelse() return NA and would erase a recorded follow-up value.
  #   `%in%` is FALSE in that case, so the existing value is kept.
  # - Factor follow-ups go through as.character() before and after ifelse()
  #   so that the level labels are kept rather than the integer codes.
  cbo_intermediate %<>%
    dplyr::mutate(
      # Secretary general (sg)
      meet_sg_num = ifelse(meet_sg_any %in% 0, 0, meet_sg_num),
      meet_sg_requested = as.factor(as.character(
        ifelse(meet_sg_any %in% 0, 0, as.character(meet_sg_requested))
      )),
      # Administration
      meet_admin_num = ifelse(meet_admin_any %in% 0, 0, meet_admin_num),
      meet_admin_requested = as.factor(as.character(
        ifelse(meet_admin_any %in% 0, 0, as.character(meet_admin_requested))
      )),
      # Mayor
      meet_mayor_num = ifelse(meet_mayor_any %in% 0, 0, meet_mayor_num),
      meet_mayor_requested = as.factor(as.character(
        ifelse(meet_mayor_any %in% 0, 0, as.character(meet_mayor_requested))
      )),
      # Councilor
      meet_councilor_num = ifelse(meet_councilor_any %in% 0, 0, meet_councilor_num),
      meet_councilor_opposition = as.factor(as.character(
        ifelse(meet_councilor_any %in% 0, 0, as.character(meet_councilor_opposition))
      )),
      meet_councilor_committee = as.factor(as.character(
        ifelse(meet_councilor_any %in% 0, 0, as.character(meet_councilor_committee))
      )),
      meet_councilor_requested = as.factor(as.character(
        ifelse(meet_councilor_any %in% 0, 0, as.character(meet_councilor_requested))
      ))
    )
  
  #Summary statistics of the activism variables after the recoding
  cbo_intermediate[activism] %>%
    sumstats #Only NAs left are due to "Don't Know" answers

  #Three-way table: meet_cm_invited x meet_cdc_invited x meet_admin_any
  table(cbo_intermediate[["meet_cm_invited"]],
        cbo_intermediate[["meet_cdc_invited"]],
        cbo_intermediate[["meet_admin_any"]])

  #Some extreme values in some cases: looking at the numeric activism variables
  cbo_intermediate[activism] %>%
    select_if(is.numeric) %>%
    sumstats
  #Listing the observations with meet_sg_num of 100 or more, with their other meeting counts
  cbo_intermediate %>%
    filter(meet_sg_num >= 100) %>%
    select(appcode,
           meet_sg_num,
           meet_admin_num,
           meet_mayor_num,
           meet_councilor_num,
           meet_village)
  
  # Note # There are two observations with an abnormally high number of different types of meetings (meet_sg_num = 200)
  
  #I flag those with number over 75
  # lotsofmeetingsvec <- cbo_intermediate[activism] %>%
  #   select_if(is.numeric) %>%
  #   filter_all(any_vars(. > 75)) %>% #Any CBO with at least one number of meeting over 75
  #   select_if( ~ max(.) > 75, na.rm = TRUE) %>% colnames #Any variable with max above 75
  # lotsofmeetings <- cbo_intermediate %>%
  #   select(appcode, all_of(lotsofmeetingsvec)) %>% #Using id to identify observations
  #   filter(if_any(all_of(lotsofmeetingsvec), ~ .x > 75)) #Only keeping observations with at least one var in this case
  # 
  # for (i in unique(lotsofmeetings$appcode)) {
  #   ##Used for loop inside of map because map only saves the last iteration for some reason
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = "_sg_num, _admin_num, _mayor_num, _councilor_num, _meet_village",
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "High number of meetings: observation with at least 75 meetings in 2019 and 2020 for one category."
  #   )
  #   rm(idss)
  # }
  # rm(lotsofmeetingsvec, lotsofmeetings)
  
  #meet_education_0, meet_health_0 and meet_water_0 are very confusing:
  #building reversed _any versions (1 when the _0 variable equals 0, 0 otherwise, NA stays NA)
  cbo_intermediate %<>%
    mutate(
      meet_education_any = as.factor(as.integer(meet_education_0 == 0)),
      meet_health_any = as.factor(as.integer(meet_health_0 == 0)),
      meet_water_any = as.factor(as.integer(meet_water_0 == 0))
    )
  
  #Checking non-binary factor variables: tabulating every activism factor that has no "1" level
  #("Don't Know" turned into NA and incomplete rows dropped first)
  cbo_intermediate[activism] %>%
    na_if("Don't Know") %>%
    drop_na() %>%
    select_if( ~ is.factor(.) && !("1" %in% levels(.))) %>%
    sapply(table)
  
  # Note # They are all multiple-choice factor variables; each possible choice should have an associated binary variable.
  # Should we check that the binary variables for these multiple-choice questions have been built correctly?
  
  ### Checking variable distributions: recip ###
  sumstats(cbo_intermediate[recip] %>% na_if("Don't Know")) #One multichoice
  table(cbo_intermediate[["recip_any"]])
  #Names of the recip factor variables (other than recip_any) that have a "1" level
  #NOTE(review): reciptochange is not referenced again in the rest of this script
  reciptochange <- cbo_intermediate[recip] %>%
    select(-recip_any) %>%
    select_if( ~ is.factor(.) && "1" %in% levels(.)) %>%
    colnames
  
  #Putting every recip_type_* variable to 0 when recip_any is 0, keeping the original answer otherwise.
  #Going through character so that R does not change factor levels.
  cbo_intermediate %<>%
    mutate(
      across(
        c(
          recip_type_availability,
          recip_type_permissions,
          recip_type_funding,
          recip_type_recommendations,
          recip_type_responsiveness,
          recip_type_invitations,
          recip_type_mobilization,
          recip_type_dissemination,
          recip_type_information,
          recip_type_documents,
          recip_type_other
        ),
        ~ as.factor(as.character(ifelse(recip_any == 0, 0, as.character(.x))))
      )
    )
  
  ### Checking variable distributions: research ###
  #First call uses the variables as they are, second call turns "Don't Know" into NA beforehand
  sumstats(cbo_intermediate[research]) #All the factors are binary
  sumstats(cbo_intermediate[research] %>% na_if("Don't Know")) #A few don't know (they appear as NA here)

  # Note # Free-text entry variables (e.g. research_opinion_detail) may need dedicated handling if they are used
  
  ### Checking variable distributions: organize ###
  #Same two-step check as for research: raw variables, then with "Don't Know" as NA
  sumstats(cbo_intermediate[organize])
  sumstats(cbo_intermediate[organize] %>% na_if("Don't Know")) #A few don't know
  
  # Note # Same issue as in previous varlists: _num variables are wrongly put as NA when the associated binary variable is 0.

  #Each _num variable becomes 0 when its associated binary variable is 0;
  #it stays NA when the binary variable is "Don't Know"
  cbo_intermediate %<>%
    dplyr::mutate(
      organize_villagemeeting_num = ifelse(
        organize_villagemeeting == 0, 0, organize_villagemeeting_num
      ),
      organize_collab_num = ifelse(
        organize_collab == 0, 0, organize_collab_num
      ),
      organize_stakeholder_num = ifelse(
        organize_stakeholder == 0, 0, organize_stakeholder_num
      )
    )

  #Reporting the change for each of the three recoded variables
  reporting.changes(
    cbo_intermediate,
    "organize_villagemeeting_num",
    describe = "NA to 0 for CBOs due to 0 value in associated binary variable."
  )
  reporting.changes(
    cbo_intermediate,
    "organize_collab_num",
    describe = "NA to 0 for CBOs due to 0 value in associated binary variable."
  )
  reporting.changes(
    cbo_intermediate,
    "organize_stakeholder_num",
    describe = "NA to 0 for CBOs due to 0 value in associated binary variable."
  )

  #Checking the distributions again after the recoding
  sumstats(cbo_intermediate[organize] %>% na_if("Don't Know")) #A few don't know
  
  #Some very high values in the numeric variables: flagging those above 50
  # lotsoforgasvec <- cbo_intermediate[organize] %>%
  #   select_if(is.numeric) %>%
  #   filter_all(any_vars(. > 50)) %>% #Any CBO with at least one number of meeting over 50
  #   select_if( ~ max(.) > 50, na.rm = TRUE) %>% colnames #Any variable with max above 50
  # lotsoforgas <- cbo_intermediate %>%
  #   select(appcode, all_of(lotsoforgasvec)) %>% #Using id to identify observations
  #   filter(if_any(all_of(lotsoforgasvec), ~ .x > 50)) #Only keeping observations with at least one var in this case
  
  # for (i in unique(lotsoforgas$appcode)) {
  #   ##Used for loop inside of map because map only saves the last iteration for some reason
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = "organize_villagemeeting_num, organize_collab_num, organize_stakeholder_num",
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "High number of collab: observation with at least 50 collaborations in 2019 and 2020 for at least one category."
  #   )
  #   rm(idss)
  # }
  # rm(lotsoforgasvec, lotsoforgas)
  
  ### Random check of integrity of dataset ###
  #The number of rows must be exactly the one of cbo_intermediate_untouched:
  #the test is !=, so it fires for added observations as well as dropped ones
  #(message aligned with the identical check made later in this script)
  if (nrow(cbo_intermediate_untouched) != nrow(cbo_intermediate)) {
    stop(
      "Something went wrong: some observations were dropped/added since the first creation of cbo_intermediate."
    )
  }
  #Only a net loss of columns is treated as an error (having more columns is accepted)
  if (ncol(cbo_intermediate_untouched) > ncol(cbo_intermediate)) {
    stop(
      "Something went wrong: some variables were dropped since the first creation of cbo_intermediate."
    )
  }
  ###                                      ####
  
  ### Checking variable distributions: pressure ###
  sumstats(cbo_intermediate[pressure])
  sumstats(cbo_intermediate[pressure] %>% na_if("Don't Know")) #1 don't know for pressure_campaign

  # Note # Same issue as in previous varlists: _num variables are wrongly put as NA when the associated binary variable is 0.
  #Each _num variable becomes 0 when its associated binary variable is 0;
  #it stays NA when the binary variable is "Don't Know"
  cbo_intermediate %<>%
    dplyr::mutate(
      pressure_campaign_num = ifelse(
        pressure_campaign == 0, 0, pressure_campaign_num
      ),
      pressure_protest_num = ifelse(
        pressure_protest == 0, 0, pressure_protest_num
      )
    )

  #Reporting the change for both recoded variables
  reporting.changes(
    cbo_intermediate,
    "pressure_campaign_num",
    describe = "NA to 0 for CBOs due to 0 value in associated binary variable."
  )
  reporting.changes(
    cbo_intermediate,
    "pressure_protest_num",
    describe = "NA to 0 for CBOs due to 0 value in associated binary variable."
  )
  
  #Only 8 observations with number of outreach or information campaign above 10 (max is 48) - flagging them
  sumstats(cbo_intermediate$pressure_campaign_num)
  #Count of observations above 10 (NA ignored)
  with(cbo_intermediate, sum(pressure_campaign_num > 10, na.rm = TRUE))
  #Same count as a share of all observations (NA included in the denominator)
  with(
    cbo_intermediate,
    sum(pressure_campaign_num > 10, na.rm = TRUE) / length(pressure_campaign_num)
  )
  # 
  # lotsofpres <- cbo_intermediate %>%
  #   filter(pressure_campaign_num > 10) #Only keeping observations with at least one var in this case
  # 
  # for (i in unique(lotsofpres$appcode)) {
  #   ##Used for loop inside of map because map only saves the last iteration for some reason
  #   idss <- paste0(i)
  #   flagging(
  #     df = cbo_intermediate,
  #     selected.variable = "pressure_campaign_num",
  #     selected.id = idss,
  #     as.NA = FALSE,
  #     #Not turning into NA for now
  #     remarks = "High number of pressure_campaign: only 8 observations with at least 10 collaborations in 2019 and 2020 (max is 48)."
  #   )
  #   rm(idss)
  # }
  # rm(lotsofpres)
  
  ### Checking variable distributions: local_pol ###
  sumstats(cbo_intermediate[local_pol]) ##All characters

  ### Checking variable distributions: knowledge_local_pol ###
  sumstats(cbo_intermediate[knowledge_local_pol]) ##None missing - All factor except one numeric (most in the correct/incorrect format)
  #Listing the levels of each knowledge_local_pol variable
  cbo_intermediate[knowledge_local_pol] %>%
    sapply(levels)

  ### Checking variable distributions: decision_ex (decision exercise) ###
  sumstats(cbo_intermediate[decision_ex])
  
  # Note # Will look at those later if necessary
  
  ### Random check of integrity of dataset ###
  #Row count has to match the one of cbo_intermediate_untouched
  if (nrow(cbo_intermediate_untouched) != nrow(cbo_intermediate)) {
    stop(
      "Something went wrong: some observations were dropped/added since the first creation of cbo_intermediate."
    )
  }
  #Column count must not be lower than the one of cbo_intermediate_untouched
  if (ncol(cbo_intermediate_untouched) > ncol(cbo_intermediate)) {
    stop(
      "Something went wrong: some variables were dropped since the first creation of cbo_intermediate."
    )
  }
  ###                                      ####
  
  ### Saving cbo_intermediate (pre-preparation for analysis) cleaned CBO group dataset ###
  #Written as an RDS file under <datatype>/Intermediate, with the path resolved by here()
  cbo_intermediate %>%
    saveRDS(file = here(datatype, "Intermediate", "CBO_groups_intermediate.RDS"))
  
message("**01 completed")

######################################################################
#END#
